Sample name: SRX1025890_TC32_NT_DRIP.hg38
Sample type: DRIP-Seq
Genome: hg38
Time finished: Thu Jun 10 18:51:30 2021
R-loop broad peaks were called with macs2 and then compared with genomic features using assignGenomeAnnotation from homer.
anno_cats <- data.frame(
"Annotation" = c('Intergenic', 'Simple_repeat', 'Satellite', 'Promoter', 'pseudo', 'Intron', 'TTS', 'LINE', 'LTR', 'SINE', 'DNA', 'CpG-Island', 'ncRNA', 'Low_complexity', 'Exon', 'snRNA', 'Retroposon', '5UTR', '3UTR', 'srpRNA', 'tRNA', 'RC', 'scRNA', 'miRNA', 'RNA', 'snoRNA'),
"annotate_type" = c("gene", "rep", "rep", "gene", "RNA", "gene", "gene",
"rep", "rep", "rep", "rep", "gene", "RNA", "rep",
"gene", "RNA", "rep", "gene", "gene", "RNA",
"RNA", "RNA", "RNA", "RNA", "RNA", "RNA"), stringsAsFactors = FALSE
)
anno_data_plt <- anno_data %>%
dplyr::inner_join(y = anno_cats, by = "Annotation")
minplt <- min(anno_data_plt$`Log2 Ratio (obs/exp)`) * 1.05
maxplt <- max(anno_data_plt$`Log2 Ratio (obs/exp)`) * 1.05
# TODO: Add average to this with SEM
plt1 <- anno_data_plt %>%
dplyr::filter(annotate_type == "gene") %>%
ggplot2::ggplot(mapping = ggplot2::aes(x = factor(.data$Annotation,
levels = c("CpG-Island",
"Promoter",
"5UTR",
"Exon",
"Intron",
"3UTR",
"TTS",
"Intergenic")),
y = .data$`Log2 Ratio (obs/exp)`)) +
ggplot2::geom_col(fill = "firebrick") +
ggplot2::ylab("Log2 Enrichment (obs/exp)") +
ggplot2::xlab(NULL) +
ggplot2::labs(title = "Genomic Features") +
ggplot2::theme_bw() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, vjust = 0.5,
hjust = 1)) +
ggplot2::ylim(minplt, maxplt)
plt2 <- anno_data_plt %>%
dplyr::filter(annotate_type == "RNA") %>%
ggplot2::ggplot(mapping = ggplot2::aes(x = .data$Annotation,
y = .data$`Log2 Ratio (obs/exp)`)) +
ggplot2::geom_col(fill = "goldenrod") +
ggplot2::ylab(NULL) +
ggplot2::xlab("Annotation") +
ggplot2::labs(title = "ncRNAs") +
ggplot2::theme_bw() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, vjust = 0.5,
hjust = 1)) +
ggplot2::ylim(minplt, maxplt)
plt3 <- anno_data_plt %>%
dplyr::filter(annotate_type == "rep") %>%
ggplot2::ggplot(mapping = ggplot2::aes(x = .data$Annotation,
y = .data$`Log2 Ratio (obs/exp)`)) +
ggplot2::geom_col(fill = "forestgreen") +
ggplot2::ylab(NULL) +
ggplot2::xlab(NULL) +
ggplot2::labs(title = "Repetitive Elements") +
ggplot2::theme_bw() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, vjust = 0.5,
hjust = 1)) +
ggplot2::ylim(minplt, maxplt)
gt1 <- ggplot2::ggplotGrob(plt1)
gt2 <- ggplot2::ggplotGrob(plt2)
gt3 <- ggplot2::ggplotGrob(plt3)
if (length(plt2$data$Annotation) && length(plt3$data$Annotation)) {
plts <- cbind(gt1, gt2, gt3, size = "last")
} else if (length(plt2$data$Annotation)) {
plts <- cbind(gt1, gt2, size = "last")
} else {
plts <- gt1
}
grid::grid.newpage()
grid::grid.draw(plts)
RLFS were derived across the genome using QmRLFS-finder.py. R-loop broad peaks were called with macs2 and then compared with RLFS using permTest from the RegioneR R package. An empirical distribution of RLFS was generated using the circularRandomizeRegions method and compared to the peaks in order to calculate enrichment p value and zscore (effect size of enrichment).
pt <- rlfs_data[[1]]
pval <- pt[[1]]$pval
ntimes <- pt[[1]]$ntimes
zscore <- pt[[1]]$zscore
rlfs_pval_color <- ifelse(pval > .05, 'red', ifelse(pval > .01, 'orange', 'green'))
rlfs_zs_color <- ifelse(zscore < 5, 'red', ifelse(zscore < 15, 'orange', 'green'))
lz <- rlfs_data[[2]]
From this analysis, the empirically-determined p value was 0.001996 (with 500 permutations, the minimum possible p value was 0.001996). The enrichment z-score was 26.2923.
plot(pt)
to_plot <- data.frame("zscore" = lz$`regioneR::numOverlaps`$shifted.z.scores,
"shift" = lz$`regioneR::numOverlaps`$shifts)
plt <- ggplot2::ggplot(to_plot, ggplot2::aes(y = .data$zscore,
x = .data$shift)) +
ggplot2::geom_line() +
ggplot2::labs(title = "Peak enrichment around RLFS") +
ggplot2::ylab("Peak Enrichment (Z-Score)") +
ggplot2::xlab("Distance to RLFS (bp)") +
ggplot2::theme_bw(base_size = 15)
plotly::ggplotly(plt)
Using the method described in Chedin et al. 2020, the correlation between SRX1025890_TC32_NT_DRIP.hg38 and the other samples in RMapDB was calculated. The resulting correlation matrix is available in the QC folder.
anno_now <- corr_data$annoNow
sample_name <- configlist$sample_name
anno_now$correlations <- corr_data$corMat[,sample_name]
anno_now$sample_name <- rownames(anno_now)
anno_now <- anno_now[order(anno_now$correlations, decreasing = FALSE), ]
anno_now$rank <- seq(anno_now$sample_name)
plt <- ggplot2::ggplot(anno_now,
mapping = ggplot2::aes(x = .data$rank,
label = .data$sample_name,
y = .data$correlations,
color = .data$Mode)) +
ggplot2::geom_point() +
ggplot2::ylab("Correlation (R)") +
ggplot2::xlab(NULL) +
ggplot2::labs(title = paste0(sample_name, " correlation with RMapDB")) +
ggplot2::theme_bw()
plotly::ggplotly(p = plt)
Correlation heatmap (full-size image in QC folder)
annoNow <- corr_data$annoNow
corMat <- corr_data$corMat
# Make static correlation picture
# From: https://stackoverflow.com/questions/31677923/set-0-point-for-pheatmap-in-r
paletteLength <- 100
myColor <- colorRampPalette(rev(
RColorBrewer::brewer.pal(n = 7, name = "RdBu")))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(min(corMat), 0, length.out=ceiling(paletteLength/2) + 1),
seq(max(corMat)/paletteLength, max(corMat), length.out=floor(paletteLength/2)))
pheatmap::pheatmap(corMat, show_rownames = FALSE,
show_colnames = FALSE,
annotation_colors = list(
Source = c("RMapDB" = "grey", "User-supplied" = "firebrick")
),
color = myColor, breaks = myBreaks,
annotation_col = annoNow)
# TODO: What should this number be? Can we arrive at a more robust estimate?
total_peaks_color <- ifelse(total_peaks < 3000, 'red', ifelse(total_peaks < 6000, 'orange', 'green'))
The peak caller macs was used for peak calling. Number of peaks called: 2836
reads_aligned <- bam_stats$reads_aligned / 1e6
duplicate_reads <- bam_stats$duplicate_reads / 1e6
if (show_readqc) {
total_reads <- bam_stats$total_reads / 1e6
total_reads_color <- "black"
pct_aligned <- round(100*(reads_aligned/total_reads), digits = 2)
pct_align_color <- ifelse(pct_aligned > 80, "green", ifelse(pct_aligned > 60, "orange", "red"))
pct_duplicated <- round(100*(duplicate_reads/total_reads), digits = 2)
pct_duplicated_color <- ifelse(pct_duplicated < 35, "green", ifelse(pct_aligned < 50, "orange", "red"))
} else {
total_reads <- "N/A"
total_reads_color <- "grey"
pct_aligned <- "N/A"
pct_align_color <- "grey"
pct_duplicated <- "N/A"
pct_duplicated_color <- "grey"
}
genome <- configlist$genome
Reads were aligned to hg38 with bwa mem.
Total reads (millions): N/A
Aligned reads (millions): 0.983307
Percent of reads aligned: N/A
Duplicated reads (millions): 0.022535
Percent of reads duplicated: N/A
RSeq © 2021, Bishop Lab, UT Health San Antonio
RSeq maintainer: millerh1@uthscsa.edu